Raw data

- Study summary: Neurological complications are common in patients with COVID-19. While SARS-CoV-2, the causal pathogen of COVID-19, has been detected in some patient brains, its ability to infect brain cells and impact their function are not well understood, and experimental models using human brain cells are urgently needed. Here we investigated the susceptibility of human induced pluripotent stem cell (hiPSC)-derived monolayer brain cells and region-specific brain organoids to SARS-CoV-2 infection. We found modest numbers of infected neurons and astrocytes, but greater infection of choroid plexus epithelial cells. We optimized a protocol to generate choroid plexus organoids from hiPSCs, which revealed productive SARS-CoV-2 infection that leads to increased cell death and transcriptional dysregulation indicative of an inflammatory response and cellular function deficits. Together, our results provide evidence for SARS-CoV-2 neurotropism and support use of hiPSC-derived brain organoids as a platform to investigate the cellular susceptibility, disease mechanisms, and treatment strategies for SARS-CoV-2 infection. Bulk RNA-seq of choroid plexus organoids (CPOs) was performed on mock 72 hours post-infection (hpi), SARS-CoV-2 24 hpi, and SARS-CoV-2 72 hpi samples. All conditions were profiled in triplicate.

Loading packages

library(data.table)
library(rmarkdown)
library(AnnotationHub)
library(tidyverse)
library(tximport)
library(ggplot2)
library(DESeq2)
library(pheatmap)
library(gridExtra)
library(UpSetR)
library(apeglm)
library(ashr)

Setting AnnotationHub

AnnotationSpecies <- "Homo sapiens"  # Assign your species 
ah <- AnnotationHub(hub=getAnnotationHubOption("URL"))   # Bring annotation DB

Running AnnotationHub

# Filter annotation of interest
ahQuery <- query(ah, c("OrgDb", AnnotationSpecies))      

if (length(ahQuery) == 1) {
    DBName <- names(ahQuery)
} else if (length(ahQuery) > 1) {
               DBName <- names(ahQuery)[1]
} else {
    print("You don't have a valid DB")
    rmarkdown::render() 
} 

AnnoDb <- ah[[DBName]] # Store into an OrgDb object  

# Explore your OrgDb object with following accessors:
# columns(AnnpDb)
# keytypes(AnnoDb)
# keys(AnnoDb, keytype=..)
# select(AnnoDb, keys=.., columns=.., keytype=...)
AnnoKey <- keys(AnnoDb, keytype="ENSEMBLTRANS")

# Note: Annotation has to be done with not genome but transcripts 
AnnoDb <- select(AnnoDb, 
                 AnnoKey,
                 keytype="ENSEMBLTRANS",
                 columns="ENSEMBL")

# Check if your AnnoDb has been extracted and saved correctely
class(AnnoDb)
## [1] "data.frame"
head(AnnoDb)
##      ENSEMBLTRANS         ENSEMBL
## 1 ENST00000543404 ENSG00000256069
## 2 ENST00000566278 ENSG00000256069
## 3 ENST00000545343 ENSG00000256069
## 4 ENST00000544183 ENSG00000256069
## 5 ENST00000286479 ENSG00000156006
## 6 ENST00000520116 ENSG00000156006

Defining file name and path for .sf files

.sf files have been created from fastq data by salmon

# This code chunk needs to be written by yourself 
#
# Define sample names 
SampleNames <-  c(paste0("Mock-rep", 1:3), paste0("SARS-CoV-2-rep", 1:3)) 

# Define group level
GroupLevel <- c("Mock", "Covid")

# Define contrast for DE analysis
Contrast <- c("Group", GroupLevel)


# Define a vector for comparing TPM vs Counts effect 
TvC <- c("TPM", "Counts")
levels(TvC) <- TvC

# Define .sf file path
sf <- c(paste0("salmon_output/", 
               SampleNames,
               ".salmon_quant/quant.sf"))

# Define sample groups
group <- c(rep(GroupLevel[1], 3), rep(GroupLevel[2], 3))

# Create metadata
metadata <- data.frame(Sample=factor(SampleNames, levels=SampleNames),
                       Group=factor(group, levels=GroupLevel),
                       Path=sf)

rownames(metadata) <- SampleNames

# Explore the metadata
print(metadata)
##                          Sample Group
## Mock-rep1             Mock-rep1  Mock
## Mock-rep2             Mock-rep2  Mock
## Mock-rep3             Mock-rep3  Mock
## SARS-CoV-2-rep1 SARS-CoV-2-rep1 Covid
## SARS-CoV-2-rep2 SARS-CoV-2-rep2 Covid
## SARS-CoV-2-rep3 SARS-CoV-2-rep3 Covid
##                                                                Path
## Mock-rep1             salmon_output/Mock-rep1.salmon_quant/quant.sf
## Mock-rep2             salmon_output/Mock-rep2.salmon_quant/quant.sf
## Mock-rep3             salmon_output/Mock-rep3.salmon_quant/quant.sf
## SARS-CoV-2-rep1 salmon_output/SARS-CoV-2-rep1.salmon_quant/quant.sf
## SARS-CoV-2-rep2 salmon_output/SARS-CoV-2-rep2.salmon_quant/quant.sf
## SARS-CoV-2-rep3 salmon_output/SARS-CoV-2-rep3.salmon_quant/quant.sf

Converting .sf files to txi list

- txi_tpm: stores TPM with the argument “countsFromAbundance=”lengthScaledTPM"

- txi_counts: stores original counts

- In this project, two txi objects were created with or without the countsFromAbundance=“lengthScaledTPM” argument and compared in downstream DE analysis.

- If you don’t want gene-level summarization, set txOut=TRUE.

- References: tximport doc, DESeq2 doc “Why unnormalized counts?”, Soneson et al. 2016, Developer Dr. Love’s comment, Harvard Chan Bioinformatics Core workshop

# Assign sample names to the input (.sf) file path
names(sf) <- SampleNames

# Run tximport
# tpm vs original counts
# input sf: a factor of all .sf files' path
txi_tpm <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    countsFromAbundance="lengthScaledTPM", # Extracts TPM 
                    ignoreTxVersion=T) 

txi_counts <- tximport(sf, 
                    type="salmon",
                    tx2gene=AnnoDb,
                    ignoreTxVersion=T) 

# tpm 
head(txi_tpm$counts)
##                   Mock-rep1    Mock-rep2  Mock-rep3 SARS-CoV-2-rep1
## ENSG00000000005    7.069840    3.9538769    0.00000        2.004495
## ENSG00000001561 2862.599921 2813.7838311 2505.36278     2443.324902
## ENSG00000002933   15.843325   30.0172780   20.35533       23.846056
## ENSG00000003056 1376.863217 1457.4984469 1408.32364     2236.783582
## ENSG00000003137    6.937593    0.9777292    4.08993        2.006846
## ENSG00000004478  310.042601  359.1166879  241.37580      323.219596
##                 SARS-CoV-2-rep2 SARS-CoV-2-rep3
## ENSG00000000005        0.991949        3.993719
## ENSG00000001561     1120.670894     2064.616537
## ENSG00000002933       16.130760       22.360242
## ENSG00000003056     1261.607594     1345.676809
## ENSG00000003137        0.000000        3.949410
## ENSG00000004478      249.048081      255.754081
dim(txi_tpm$counts)
## [1] 12202     6
# counts
head(txi_counts$counts)
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000005     7.000     4.000     0.000           2.000           1.000
## ENSG00000001561  2889.000  2782.000  2515.000        2498.000        1102.000
## ENSG00000002933    17.000    32.000    18.000          25.000          14.000
## ENSG00000003056  1423.407  1538.976  1321.634        2218.945        1274.869
## ENSG00000003137     7.000     1.000     4.000           2.000           0.000
## ENSG00000004478   277.049   305.781   224.858         378.571         237.257
##                 SARS-CoV-2-rep3
## ENSG00000000005           4.000
## ENSG00000001561        2073.000
## ENSG00000002933          24.000
## ENSG00000003056        1318.894
## ENSG00000003137           4.000
## ENSG00000004478         310.414
dim(txi_counts$counts)
## [1] 12202     6

Creating DESeq objects from txi and VST

- Note: The tximport-to-DESeq2 approach uses estimated gene counts from the transcript abundance quantifiers, but not normalized counts.

- The DESeqDataSetFromTximport() function generated an DESeq object (aka dds) with the txi input.

- vst() was run to perform variance stabilizing transformation instead of rlog() which takes longer time with similar characteristics.

- The vsd object created by vst() is used for not DE analysis but QC.

- References: DESeq2 doc “Transcript abundance files”, DESeq2 doc “Variance stabilizing transformation”

# Set a function creating dds and vsd
dds_vsd_fn <- function(txi) { 

    # Create a DESeq object (so-calledd dds) 
    des <- DESeqDataSetFromTximport(txi, 
                                    colData=metadata,
                                    design=~Group)

    # Create a vsd object (so-called vsd) 
    ves <- vst(des, blind=T)

    # Output them as a list 
    return(list(dds=des, vsd=ves))

}

TPM <- dds_vsd_fn(txi_tpm)
Counts <- dds_vsd_fn(txi_counts)
# Outputs
# dds: TPM/Counts[[1]] or TPM/Counts[['dds']] 
# vsd: TPM/Counts[[2]] or TPM/Counts[['vsd']]

Exploring created dds

# TPM 
TPM[[1]]
## class: DESeqDataSet 
## dim: 12202 6 
## metadata(1): version
## assays(1): counts
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
##   ENSG00000288642
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(TPM[[1]]))
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000005         7         4         0               2               1
## ENSG00000001561      2863      2814      2505            2443            1121
## ENSG00000002933        16        30        20              24              16
## ENSG00000003056      1377      1457      1408            2237            1262
## ENSG00000003137         7         1         4               2               0
## ENSG00000004478       310       359       241             323             249
##                 SARS-CoV-2-rep3
## ENSG00000000005               4
## ENSG00000001561            2065
## ENSG00000002933              22
## ENSG00000003056            1346
## ENSG00000003137               4
## ENSG00000004478             256
# Counts
Counts[[1]]
## class: DESeqDataSet 
## dim: 12202 6 
## metadata(1): version
## assays(2): counts avgTxLength
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
##   ENSG00000288642
## rowData names(0):
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
head(counts(Counts[[1]]))
##                 Mock-rep1 Mock-rep2 Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2
## ENSG00000000005         7         4         0               2               1
## ENSG00000001561      2889      2782      2515            2498            1102
## ENSG00000002933        17        32        18              25              14
## ENSG00000003056      1423      1539      1322            2219            1275
## ENSG00000003137         7         1         4               2               0
## ENSG00000004478       277       306       225             379             237
##                 SARS-CoV-2-rep3
## ENSG00000000005               4
## ENSG00000001561            2073
## ENSG00000002933              24
## ENSG00000003056            1319
## ENSG00000003137               4
## ENSG00000004478             310

Exploring created vsd

# TPM
TPM[[2]]
## class: DESeqTransform 
## dim: 12202 6 
## metadata(1): version
## assays(1): ''
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
##   ENSG00000288642
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
# Counts
Counts[[2]]
## class: DESeqTransform 
## dim: 12202 6 
## metadata(1): version
## assays(1): ''
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
##   ENSG00000288642
## rowData names(4): baseMean baseVar allZero dispFit
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path

Estimating size factors, dispersions, and conducting Wald Test

- Dispersion is calculated as a measure of variation instead of variance since variance gets larger when gene expression gets higher.

- Wald test is the default setting of DESeq2 which tests null hypothesis between two groups. You should use Likelihood ratio test (LRT) when comparing more than two groups.

- Messages when “Counts <- DESeqPrep_fn(Counts)” was run: using ‘avgTxLength’ from assays(dds)

- References: Harvard Chan Bioinformatics Core workshop I, Harvard Chan Bioinformatics Core workshop II, Harvard Chan Bioinformatics Core workshop III, DESeq2 "Wald test indivisual steps, DESeq2 doc “Likelihood ratio test”

# Set a function estimating size factors, dispersions, and perform wald test
DESeqPrep_fn <- function(List) {
    
    List[[1]] <- estimateSizeFactors(List[[1]])
    List[[1]] <- estimateDispersions(List[[1]])
    List[[1]] <- nbinomWaldTest(List[[1]])
   
    return(List)
}
# Update dds with the function
Counts <- DESeqPrep_fn(Counts) 
TPM <- DESeqPrep_fn(TPM)

Exploring size factors

sizeFactors(Counts[[1]])
## NULL
sizeFactors(TPM[[1]])
##       Mock-rep1       Mock-rep2       Mock-rep3 SARS-CoV-2-rep1 SARS-CoV-2-rep2 
##       1.1074118       1.0517854       0.8854318       1.3678281       0.7650195 
## SARS-CoV-2-rep3 
##       1.0042626
# Size factors don't exist in the Counts dds!
# Normalization factors are calculated in the Counts dds instead! 
assays(Counts[[1]])
## List of length 6
## names(6): counts avgTxLength normalizationFactors mu H cooks
assays(TPM[[1]])
## List of length 4
## names(4): counts mu H cooks
colData(Counts[[1]])
## DataFrame with 6 rows and 3 columns
##                          Sample    Group                   Path
##                        <factor> <factor>            <character>
## Mock-rep1       Mock-rep1          Mock  salmon_output/Mock-r..
## Mock-rep2       Mock-rep2          Mock  salmon_output/Mock-r..
## Mock-rep3       Mock-rep3          Mock  salmon_output/Mock-r..
## SARS-CoV-2-rep1 SARS-CoV-2-rep1    Covid salmon_output/SARS-C..
## SARS-CoV-2-rep2 SARS-CoV-2-rep2    Covid salmon_output/SARS-C..
## SARS-CoV-2-rep3 SARS-CoV-2-rep3    Covid salmon_output/SARS-C..
colData(TPM[[1]])
## DataFrame with 6 rows and 4 columns
##                          Sample    Group                   Path sizeFactor
##                        <factor> <factor>            <character>  <numeric>
## Mock-rep1       Mock-rep1          Mock  salmon_output/Mock-r..   1.107412
## Mock-rep2       Mock-rep2          Mock  salmon_output/Mock-r..   1.051785
## Mock-rep3       Mock-rep3          Mock  salmon_output/Mock-r..   0.885432
## SARS-CoV-2-rep1 SARS-CoV-2-rep1    Covid salmon_output/SARS-C..   1.367828
## SARS-CoV-2-rep2 SARS-CoV-2-rep2    Covid salmon_output/SARS-C..   0.765019
## SARS-CoV-2-rep3 SARS-CoV-2-rep3    Covid salmon_output/SARS-C..   1.004263

Plotting the size factors in TPM

- The size factors are only available from TPM dds

- Blue dashed line: normalization factor = 1

# Extract and save the size factors in a data frame
sizeFactor <- as.data.frame(round(sizeFactors(TPM[[1]]), 3))

colnames(sizeFactor) <- 'Size_Factor'

sizeFactor <- sizeFactor %>%
    rownames_to_column(var="Sample") %>%
    inner_join(metadata[, 1:ncol(metadata)-1], by="Sample") 

sizeFactor$Sample <- factor(sizeFactor$Sample, levels=SampleNames)



# Create a plot comparing the size factors by sample
ggplot(sizeFactor, aes(x=Sample, 
                       y=Size_Factor, 
                       fill=Group,
                       label=Size_Factor)) +
    geom_bar(stat="identity", width=0.8) +
    theme_bw() + 
    ggtitle("Size Factors in TPM-DESeq") +
    geom_text(vjust=1.5) +
    theme(axis.text.x=element_text(angle=45, 
                                   vjust=0.5)) + ylab("Size Factor") + geom_hline(yintercept=1, color="blue", linetype="dashed")

Plotting nornalization factors in the Counts

- DESeq2 performs an internal normalization where geometric mean is calculated for each gene across all samples. The counts for a gene in each sample is then divided by this mean. The median of these ratios in a sample is the size factor for that sample.

- Blue dashed line: normalization factor = 1

- Colored dots: normlization factors per gene (y-axis) in each sample

- Box plots: distribution of the normalization facters in each group (see the second plot)

- Reference: DESeq2 doc “Sample-/gene-dependent normalization factors”

# Extract and normalization factors in a data frame
normf <- as.data.frame(normalizationFactors(Counts[[1]])) %>%
    gather(Sample, Normalization_Factor) %>%
    inner_join(metadata[, 1:2], by="Sample") 
normf$Sample <- factor(normf$Sample, levels=SampleNames)
normf$Group <- factor(normf$Group, levels=GroupLevel)
# Create a scatter plot showing distribution of normalization factors 
normFactors_plot <- ggplot(normf, 
       aes(x=Sample, y=Normalization_Factor)) + 
geom_jitter(alpha=0.5, aes(color=Group)) + 
# Add a boxplot to provide statistics in each sample
geom_boxplot(aes(x=Sample, y=Normalization_Factor), 
             outlier.shape=NA, alpha=0.5) + 
theme_bw() +
ggtitle("Normalization Factors in Counts-DESeq") +
theme(axis.text.x=element_text(angle=45, 
                               vjust=0.5)) + 
ylab("Normalization Factor / Gene") +
# Add a dashed horizontal line to indicate where normalization factor=1
geom_hline(yintercept=1, 
           color="blue", 
           linetype="dashed")
# Print the normalization factor scatter plot 
print(normFactors_plot)

# Print the same plot with larger y-magnification in order to observe the box plot 
normFactors_plot + 
    ylim(0.5, 1.5)

Setting functions for QC plots

- Reference: DESeq2 doc “Principal component plot of the samples”, DESeq2 doc “Heatmap of the sample-to-sample distances”

# Assigne what to compare
GroupOfInterest <- Contrast[1] 

# Set a function for a PCA plot
QCPCA_fn <- function(inputList, Title) {

    plotPCA(inputList[[2]],    # takes vsd
            intgroup=GroupOfInterest) + theme_bw() + ggtitle(Title)
}

# Set heatmap annotation 
ColOfInterest <- !colnames(metadata) %in% c("Sample", "Path")
HeatmapAnno <- as.data.frame(metadata[, ColOfInterest])
rownames(HeatmapAnno) <- SampleNames
colnames(HeatmapAnno) <- colnames(metadata)[ColOfInterest]

# Set a function for a correlation heatmap 
QCcorrHeatmap_fn <- function(inputList, Title) {

    # Extract transformed count matrix
    mtx <- assay(inputList[[2]])      # takes vsd

    # Calculate correlation and store in the matrix
    mtx <- cor(mtx)
    
    # Create a correlation heatmap
    return(pheatmap(mtx, 
             annotation=HeatmapAnno,
             main=paste("Sample Correlation Heatmap:",
                        Title)))
}

Sample QC: Principal Component Analysis (PCA)

- Checkpoints in PCA: source of variation, sample outlier

grid.arrange(QCPCA_fn(TPM, "QC PCA: TPM"), 
             QCPCA_fn(Counts, "QC PCA: Counts"), 
             nrow=2)

Sample QC: Sample Correlation Heatmap

- Checkpoints of correlation heatmap: distance between samples, correlation in a group

- Upper: TPM input

- Lower: Counts input

# TPM
QCcorrHeatmap_fn(TPM, "TPM") 

# Counts
QCcorrHeatmap_fn(Counts, "Counts") 

Running DE analysis with or without shrinkage

- Shrinkage reduces false positives

- Magnitude of shrinkage is affected by dispersion and sample size

- When the argument type is set to “apeglm”, the coef argument is used instead of constrast. In this dataset, you can set coef=Coef where Coef <- resultsNames(ddsList[[1]]).

- When the type is set to “normal”, the argument contrast is set as shown below.

- References: DESeq2 doc “Alternative shrinkage estimators”, Harvard Chan Bioinformatics Core workshop

# Create a list consisted with dds objects from TPM and Counts
desList <- list(TPM[[1]], Counts[[1]]) 
names(desList) <- TvC

# Create a list for TPM and Counts dds 
ddsList <- desList  # DE without shrinkage  
normal.ddsList <- desList    # DE with "normal" shrinkage
ape.ddsList <- desList       # DE with "apeglm" shrinkage
ash.ddsList <- desList       # DE with "ashr" shrinkage

for (x in TvC) {
    
    # Run DESeq() 
    ddsList[[x]] <- DESeq(desList[[x]])
    print(resultsNames(ddsList[[x]]))

    normal.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                                contrast=Contrast,
                                type="normal")

    ape.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="apeglm")

    ash.ddsList[[x]] <- lfcShrink(ddsList[[x]],
                             coef=resultsNames(ddsList[[x]])[2],
                             type="ashr")

}
## [1] "Intercept"           "Group_Covid_vs_Mock"
## [1] "Intercept"           "Group_Covid_vs_Mock"
# Combine every ddsList into a list
all.ddsList <- list(ddsList, normal.ddsList, ape.ddsList, ash.ddsList)
shrinkage <- c("None", "Normal", "Apeglm", "Ashr")
names(all.ddsList) <- shrinkage

Creating dispersion plots

- Dispersion is important since estimation by DESeq2 algorithm is based on the assumption that genes with similar expression levels have similar dispersion. If an RNA-seq dataset doesn’t satisfy this assumption, use other DE algorithms than DESeq2.

- References: DESeq2 doc "Dispersion plot and fitting alternatives, Harvard Chan Bioinformatics Core workshop

# Plot dispersion  
for (x in TvC) {

    plotDispEsts(ddsList[[x]], 
                 ylab="Dispersion", 
                 xlab="Mean of Normalized Counts", 
                 main=paste("Dispersion of", x, "Input"))
}

Extracting DE results with or without shrinkage

- The alpha denotes threshold of false discovery rate (FDR) assigned by users.

- In this analysis, the alpha is set to 0.1

# Set FDR threshold 
alpha=0.1 

# FDR threshold vector
FDRv=c("< 0.1", "> 0.1") 

# Initialize lists of result tables 
all.resList <- all.ddsList 

# Set a function cleaning table
Sig.fn <- function(df, Input) {
    
    df <- df %>% 
        rownames_to_column(var="Gene") %>%
        mutate(FDR=ifelse(padj < 0.1 & !is.na(padj), 
                                   FDRv[1], 
                                   FDRv[2]), 
               Input=Input) 
    return(df)
}




for (i in shrinkage) {

    if (i == "None") {

        for (x in TvC) {

        # Extract data frames from unshrunken lfc & clean data 
        all.resList[[i]][[x]] <- as.data.frame(results(all.ddsList[[i]][[x]], 
                                                       contrast=Contrast, 
                                                       alpha=alpha)) %>% 
        Sig.fn(x)

         } 
    } else {

        # Extract data frames from shrunken lfc & clean data
        for (x in TvC) {

            all.resList[[i]][[x]] <- as.data.frame(all.ddsList[[i]][[x]]) %>%
                Sig.fn(x)
    

        }
    }
}





# Explore the results 
summary(all.resList)
##        Length Class  Mode
## None   2      -none- list
## Normal 2      -none- list
## Apeglm 2      -none- list
## Ashr   2      -none- list
head(all.resList[[1]][['TPM']])
##              Gene    baseMean log2FoldChange     lfcSE       stat       pvalue
## 1 ENSG00000000005    2.812742      0.5987721 1.6141938  0.3709419 7.106808e-01
## 2 ENSG00000001561 2232.914457      0.6069591 0.1459519  4.1586231 3.201717e-05
## 3 ENSG00000002933   20.987678      0.1216964 0.5714864  0.2129472 8.313682e-01
## 4 ENSG00000003056 1474.040931     -0.1332132 0.1492349 -0.8926414 3.720493e-01
## 5 ENSG00000003137    2.872429      1.0848946 1.6061178  0.6754764 4.993732e-01
## 6 ENSG00000004478  284.996011      0.1359892 0.2062583  0.6593152 5.096934e-01
##          padj   FDR Input
## 1          NA > 0.1   TPM
## 2 0.000713163 < 0.1   TPM
## 3 0.962417071 > 0.1   TPM
## 4 0.760029654 > 0.1   TPM
## 5          NA > 0.1   TPM
## 6 0.850964349 > 0.1   TPM
head(all.resList[[1]][['Counts']])
##              Gene    baseMean log2FoldChange     lfcSE       stat       pvalue
## 1 ENSG00000000005    2.856685      0.6047562 1.5942774  0.3793293 7.044433e-01
## 2 ENSG00000001561 2267.273729      0.6080151 0.1463827  4.1536009 3.272838e-05
## 3 ENSG00000002933   21.348392      0.1187991 0.5674509  0.2093557 8.341706e-01
## 4 ENSG00000003056 1495.567472     -0.1320311 0.1496493 -0.8822703 3.776306e-01
## 5 ENSG00000003137    2.911904      1.0944043 1.5875474  0.6893680 4.905917e-01
## 6 ENSG00000004478  286.986305      0.1409851 0.2055848  0.6857757 4.928546e-01
##           padj   FDR  Input
## 1           NA > 0.1 Counts
## 2 0.0006904505 < 0.1 Counts
## 3 0.9589769953 > 0.1 Counts
## 4 0.7522981052 > 0.1 Counts
## 5           NA > 0.1 Counts
## 6 0.8353881362 > 0.1 Counts
head(all.resList[[2]][['TPM']])
##              Gene    baseMean log2FoldChange     lfcSE       stat       pvalue
## 1 ENSG00000000005    2.812742     0.13169158 0.3552874  0.3709419 7.106808e-01
## 2 ENSG00000001561 2232.914457     0.58983548 0.1418306  4.1586231 3.201717e-05
## 3 ENSG00000002933   20.987678     0.08415521 0.3954220  0.2129472 8.313682e-01
## 4 ENSG00000003056 1474.040931    -0.12928703 0.1448374 -0.8926414 3.720493e-01
## 5 ENSG00000003137    2.872429     0.23953496 0.3578170  0.6754764 4.993732e-01
## 6 ENSG00000004478  284.996011     0.12852465 0.1949507  0.6593152 5.096934e-01
##          padj   FDR Input
## 1          NA > 0.1   TPM
## 2 0.000713163 < 0.1   TPM
## 3 0.962417071 > 0.1   TPM
## 4 0.760029654 > 0.1   TPM
## 5          NA > 0.1   TPM
## 6 0.850964349 > 0.1   TPM
head(all.resList[[2]][['Counts']])
##              Gene    baseMean log2FoldChange     lfcSE       stat       pvalue
## 1 ENSG00000000005    2.856685      0.1356995 0.3580169  0.3793293 7.044433e-01
## 2 ENSG00000001561 2267.273729      0.5907791 0.1422291  4.1536009 3.272838e-05
## 3 ENSG00000002933   21.348392      0.0825164 0.3944490  0.2093557 8.341706e-01
## 4 ENSG00000003056 1495.567472     -0.1281219 0.1452198 -0.8822703 3.776306e-01
## 5 ENSG00000003137    2.911904      0.2463960 0.3604316  0.6893680 4.905917e-01
## 6 ENSG00000004478  286.986305      0.1332946 0.1943962  0.6857757 4.928546e-01
##           padj   FDR  Input
## 1           NA > 0.1 Counts
## 2 0.0006904505 < 0.1 Counts
## 3 0.9589769953 > 0.1 Counts
## 4 0.7522981052 > 0.1 Counts
## 5           NA > 0.1 Counts
## 6 0.8353881362 > 0.1 Counts

Exploring mean-difference relationship with MA plots

- x-axis: expression level (baseMean))

- y-axis: fold change (log2FoldChange)

- Red dashed lines: log2FoldChange = -1 and 1

- Reference: DESeq2 doc “MA-plot”

# Set ylim: has to adjusted by users depending on data 
yl <- c(-25, 25)

# Set min log2 fold change of interest 
mLog <- c(-1, 1)

# Initialize a list storing MA plots
MAList <- ddsList


# Create MA plots

for (i in shrinkage) {

    both.df <- rbind(all.resList[[i]][[TvC[1]]], 
                     all.resList[[i]][[TvC[2]]])

    MAList[[i]] <- ggplot(both.df, 
                              aes(x=baseMean, y=log2FoldChange, color=FDR))  +geom_point() + scale_x_log10() + facet_grid(~Input) + 
                                   theme_bw() + 
                                   scale_color_manual(values=c("blue", "grey")) +
                                   ggtitle(paste("MA plot with", i)) +
                                   ylim(yl[1], yl[2]) + 
                                   geom_hline(yintercept=c(mLog[1], mLog[2]), linetype="dashed", color="red") 

}

   

# Print MA plots
MAList
## $TPM
## class: DESeqDataSet 
## dim: 12202 6 
## metadata(1): version
## assays(4): counts mu H cooks
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
##   ENSG00000288642
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(4): Sample Group Path sizeFactor
## 
## $Counts
## class: DESeqDataSet 
## dim: 12202 6 
## metadata(1): version
## assays(6): counts avgTxLength ... H cooks
## rownames(12202): ENSG00000000005 ENSG00000001561 ... ENSG00000288631
##   ENSG00000288642
## rowData names(22): baseMean baseVar ... deviance maxCooks
## colnames(6): Mock-rep1 Mock-rep2 ... SARS-CoV-2-rep2 SARS-CoV-2-rep3
## colData names(3): Sample Group Path
## 
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

Exploring distribution of false discovery rate (FDR)

- Distribution of adjusted p-val (FDR) was presented

- x-axis: FDR

- y-axis: Number of genes

- Black dashed line: FDR = alpha

# Subset rows with FDR < alpha
both.df <- rbind(all.resList[[1]][['TPM']], 
                 all.resList[[1]][['Counts']])
both.df$Input <- factor(both.df$Input, levels=TvC)

# Plot distribution of FDR
ggplot(both.df,
       aes(x=padj, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() +
ggtitle("Distribution of False Discovery Rate (FDR)") + 
xlab("Adjusted P-Value (FDR)") + 
ylab("Count") + 
geom_vline(xintercept=alpha, 
           color="black", 
           linetype="dashed",
           size=1) + 
scale_x_continuous(breaks=seq(0, 1, by=0.1)) 

Exploring distribution of log2FoldChange by input type

- Black dashed lines: log2FoldChange = -1 and 1

- x-axis: gene expression level (log2FoldChange)

- y-axis: number of genes

# Initialize a lfc list
lfcplotList <- all.resList 

# Create lfc distribution plots
for (i in shrinkage) {

    lfc.df <- rbind(all.resList[[i]][[TvC[1]]], 
                    all.resList[[i]][[TvC[2]]])

    lfc.df <- lfc.df[lfc.df$FDR == "< 0.1",]

    lfc.df$Input <- factor(lfc.df$Input, levels=TvC)

    lfcplotList[[i]] <- ggplot(lfc.df,  # Subset rows with FDR < alpha
                               aes(x=log2FoldChange, color=Input)) + 
geom_density(size=1, aes(y=..count..)) + 
theme_bw() + ylab("Count") + 
geom_vline(xintercept=c(mLog[1], mLog[2]), 
           color="black", 
           linetype="dashed", 
           size=1) + 
ggtitle(paste("Distribution of Log2FoldChange by Input Type:", i)) + 
xlim(-8, 8)


}

# Print the lfc plots
lfcplotList
## $None

## 
## $Normal

## 
## $Apeglm

## 
## $Ashr

NA statistics: zero count genes & outlier genes

When NAs appear in

- log2FoldChange: zero counts in all samples

- padj: too little information

- pval & padj: at least one replicate was an outlier

# Count number of NA genes  
type=c("Zero Counts", "Outliers", "Total NA Genes") 


# Create a data frame storing NA gene number
NAstat <- both.df %>%
    group_by(Input) %>%
    summarize(zero=sum(is.na(log2FoldChange)), 
              outlier=sum(is.na(pvalue) & is.na(padj))) %>%
    mutate(total=zero + outlier) %>%
    gather(Type, Number, -Input) %>%
    mutate(Type=factor(case_when(Type == "zero" ~ type[1], 
                                 Type == "outlier" ~ type[2], 
                                 Type == "total" ~ type[3]), 
                       levels=type))

# Plot number of NA genes 
ggplot(NAstat, 
       aes(x=Type, y=Number, group=Input, fill=Input, label=Number)) + 
    geom_bar(stat="identity", position="dodge") + 
    theme_bw() +
    geom_text(position=position_dodge(width=1), vjust=1.5) + 
    ggtitle("Number of NA Genes") + 
    ylab("Number of Genes")

Ranking DEGs with the TPM and original count inputs

- fdr.rank: ranked by FDR

- lfc.rank: ranked by absolute fold change

- up.lfc.rank: ranked by magnitude of fold change increase

- down.lfc.rank: ranked by manitude of fold change decrease

# Create a new list having DE table with FDR below alpha
fdr.rank <- all.resList
lfc.rank <- all.resList
up.lfc.rank <- all.resList
down.lfc.rank <- all.resList

# Set a function cleaning a data frame 
filter.fdr.fn <- function(df) {as.data.table(df[df$FDR == FDRv[1],])}

# Set a function creating a column for the rank
Ranking.fn <- function(x) {mutate(x, Rank=1:nrow(x))}


for (i in shrinkage) {

    for (x in TvC) {

        rankdf <- all.resList[[i]][[x]]

        fdr.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(padj) %>% Ranking.fn() 

        lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% arrange(desc(abs(log2FoldChange))) %>% Ranking.fn()

        up.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>% 
            arrange(desc(log2FoldChange)) %>% 
            Ranking.fn()

        down.lfc.rank[[i]][[x]] <- filter.fdr.fn(rankdf) %>%
            arrange(log2FoldChange) %>%
            Ranking.fn()

    }
}

# Explore the ranking outputs
head(fdr.rank[[1]][[1]])
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000118523  3782.0311      -3.485938 0.1962458 -17.76312 1.364396e-70
## 2: ENSG00000181104  2461.7802      -1.955899 0.1228322 -15.92334 4.364442e-57
## 3: ENSG00000163814  1501.3368      -2.750119 0.1924159 -14.29258 2.434347e-46
## 4: ENSG00000211455 11847.3131      -1.939889 0.1370266 -14.15703 1.689964e-45
## 5: ENSG00000163762  1897.0253      -2.032626 0.1454228 -13.97735 2.143160e-44
## 6: ENSG00000138271   526.3565      -3.596864 0.2642291 -13.61267 3.367067e-42
##            padj   FDR Input Rank
## 1: 4.984140e-67 < 0.1   TPM    1
## 2: 7.971653e-54 < 0.1   TPM    2
## 3: 2.964223e-43 < 0.1   TPM    3
## 4: 1.543360e-42 < 0.1   TPM    4
## 5: 1.565793e-41 < 0.1   TPM    5
## 6: 2.049982e-39 < 0.1   TPM    6
head(fdr.rank[[1]][[2]])
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000118523  3838.9322      -3.484644 0.1958885 -17.78892 8.612446e-71
## 2: ENSG00000181104  2499.2766      -1.954952 0.1237453 -15.79819 3.201777e-56
## 3: ENSG00000163814  1523.3582      -2.749530 0.1921129 -14.31205 1.840047e-46
## 4: ENSG00000211455 12017.3314      -1.938785 0.1377660 -14.07303 5.563266e-45
## 5: ENSG00000163762  1920.3664      -2.031763 0.1458131 -13.93402 3.935649e-44
## 6: ENSG00000138271   533.8481      -3.595748 0.2628070 -13.68208 1.299139e-42
##            padj   FDR  Input Rank
## 1: 3.016079e-67 < 0.1 Counts    1
## 2: 5.606311e-53 < 0.1 Counts    2
## 3: 2.147949e-43 < 0.1 Counts    3
## 4: 4.870639e-42 < 0.1 Counts    4
## 5: 2.756528e-41 < 0.1 Counts    5
## 6: 7.582642e-40 < 0.1 Counts    6
head(lfc.rank[[1]][[1]])
##               Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000244398 15.032297     -20.412256 3.9097462 -5.220865 1.780897e-07
## 2: ENSG00000248167 14.754788      -7.337706 1.3595099 -5.397317 6.764473e-08
## 3: ENSG00000198796 46.260638      -6.563202 1.5071724 -4.354646 1.332824e-05
## 4: ENSG00000178776  7.011544      -6.267343 1.6282173 -3.849206 1.185015e-04
## 5: ENSG00000263647  6.032185       6.052644 1.6127242  3.753056 1.746919e-04
## 6: ENSG00000205277 31.318717      -5.964592 0.9955208 -5.991429 2.080048e-09
##            padj   FDR Input Rank
## 1: 7.477720e-06 < 0.1   TPM    1
## 2: 3.013490e-06 < 0.1   TPM    2
## 3: 3.267655e-04 < 0.1   TPM    3
## 4: 2.153661e-03 < 0.1   TPM    4
## 5: 2.927291e-03 < 0.1   TPM    5
## 6: 1.225551e-07 < 0.1   TPM    6
head(lfc.rank[[1]][[2]])
##               Gene  baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000244398 15.382334     -20.039517 3.9097199 -5.125563 2.966494e-07
## 2: ENSG00000198796 44.976447      -7.660505 1.3185827 -5.809651 6.260322e-09
## 3: ENSG00000248167 14.908009      -7.331863 1.3535326 -5.416835 6.066326e-08
## 4: ENSG00000178776  7.107135      -6.265639 1.6141288 -3.881747 1.037088e-04
## 5: ENSG00000263647  6.125926       6.056916 1.5988185  3.788370 1.516389e-04
## 6: ENSG00000205277 19.783206      -5.933502 0.9114866 -6.509697 7.530253e-11
##            padj   FDR  Input Rank
## 1: 1.141611e-05 < 0.1 Counts    1
## 2: 3.272186e-07 < 0.1 Counts    2
## 3: 2.590765e-06 < 0.1 Counts    3
## 4: 1.806906e-03 < 0.1 Counts    4
## 5: 2.516775e-03 < 0.1 Counts    5
## 6: 5.860210e-09 < 0.1 Counts    6
head(up.lfc.rank[[1]][[1]])
##               Gene  baseMean log2FoldChange    lfcSE     stat       pvalue
## 1: ENSG00000263647  6.032185       6.052644 1.612724 3.753056 1.746919e-04
## 2: ENSG00000271723 19.767862       5.204963 1.040490 5.002416 5.661633e-07
## 3: ENSG00000286053  5.058302       4.795716 1.832224 2.617428 8.859512e-03
## 4: ENSG00000261408  5.481395       4.148588 1.541464 2.691330 7.116780e-03
## 5: ENSG00000251177  8.960407       3.761525 1.093682 3.439321 5.831748e-04
## 6: ENSG00000205111  8.949362       3.715642 1.141438 3.255230 1.133007e-03
##            padj   FDR Input Rank
## 1: 2.927291e-03 < 0.1   TPM    1
## 2: 2.150265e-05 < 0.1   TPM    2
## 3: 7.035608e-02 < 0.1   TPM    3
## 4: 5.922004e-02 < 0.1   TPM    4
## 5: 8.131059e-03 < 0.1   TPM    5
## 6: 1.406225e-02 < 0.1   TPM    6
head(up.lfc.rank[[1]][[2]])
##               Gene  baseMean log2FoldChange     lfcSE     stat       pvalue
## 1: ENSG00000263647  6.125926       6.056916 1.5988185 3.788370 1.516389e-04
## 2: ENSG00000271723 20.267804       4.781448 0.9646888 4.956467 7.178645e-07
## 3: ENSG00000251177  9.087309       3.767220 1.0849146 3.472366 5.158934e-04
## 4: ENSG00000205002 15.239188       3.413080 0.8618122 3.960353 7.483913e-05
## 5: ENSG00000205111  8.562319       3.400694 1.0772793 3.156744 1.595416e-03
## 6: ENSG00000105641 14.467906       3.174780 0.8040172 3.948647 7.859416e-05
##            padj   FDR  Input Rank
## 1: 2.516775e-03 < 0.1 Counts    1
## 2: 2.539355e-05 < 0.1 Counts    2
## 3: 6.975516e-03 < 0.1 Counts    3
## 4: 1.386702e-03 < 0.1 Counts    4
## 5: 1.756964e-02 < 0.1 Counts    5
## 6: 1.435966e-03 < 0.1 Counts    6
head(down.lfc.rank[[1]][[1]])
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000244398  15.032297     -20.412256 3.9097462 -5.220865 1.780897e-07
## 2: ENSG00000248167  14.754788      -7.337706 1.3595099 -5.397317 6.764473e-08
## 3: ENSG00000198796  46.260638      -6.563202 1.5071724 -4.354646 1.332824e-05
## 4: ENSG00000178776   7.011544      -6.267343 1.6282173 -3.849206 1.185015e-04
## 5: ENSG00000205277  31.318717      -5.964592 0.9955208 -5.991429 2.080048e-09
## 6: ENSG00000137673 134.059247      -5.337356 1.1053841 -4.828508 1.375598e-06
##            padj   FDR Input Rank
## 1: 7.477720e-06 < 0.1   TPM    1
## 2: 3.013490e-06 < 0.1   TPM    2
## 3: 3.267655e-04 < 0.1   TPM    3
## 4: 2.153661e-03 < 0.1   TPM    4
## 5: 1.225551e-07 < 0.1   TPM    5
## 6: 4.696319e-05 < 0.1   TPM    6
head(down.lfc.rank[[1]][[2]])
##               Gene   baseMean log2FoldChange     lfcSE      stat       pvalue
## 1: ENSG00000244398  15.382334     -20.039517 3.9097199 -5.125563 2.966494e-07
## 2: ENSG00000198796  44.976447      -7.660505 1.3185827 -5.809651 6.260322e-09
## 3: ENSG00000248167  14.908009      -7.331863 1.3535326 -5.416835 6.066326e-08
## 4: ENSG00000178776   7.107135      -6.265639 1.6141288 -3.881747 1.037088e-04
## 5: ENSG00000205277  19.783206      -5.933502 0.9114866 -6.509697 7.530253e-11
## 6: ENSG00000137673 136.045521      -5.333046 1.1047964 -4.827175 1.384833e-06
##            padj   FDR  Input Rank
## 1: 1.141611e-05 < 0.1 Counts    1
## 2: 3.272186e-07 < 0.1 Counts    2
## 3: 2.590765e-06 < 0.1 Counts    3
## 4: 1.806906e-03 < 0.1 Counts    4
## 5: 5.860210e-09 < 0.1 Counts    5
## 6: 4.369085e-05 < 0.1 Counts    6

Calculating rank difference

# Set a function rebuilding DE tables with gene ranks 
rankdiff.fn <- function(List){

    # Select columns and join the data frames by gene
    full_join(List[[TvC[1]]][, .(Gene, Input, Rank, baseMean)], 
              List[[TvC[2]]][, .(Gene, Input, Rank, baseMean)], 
              by="Gene") %>%
    
    # Add columns assining gene expression levels, rank differences, and mean ranks
    mutate(logMeanExpression=log(baseMean.x+baseMean.y/2),
           RankDiff=Rank.x-Rank.y, 
           MeanRank=(Rank.x+Rank.y)/2)
} 

# Set a function adding Shrinkage column 
add.shr.fn <- function(df, shr) {mutate(df, Shrinkage=shr)} 

# Set a function rbinding multiple data frames 
rbinds.fn <- function(List) {rbind(List[[1]], 
                                   List[[2]], 
                                   List[[3]], 
                                   List[[4]])}



# Calculate and plot rank difference
for (i in shrinkage) {

    # Calculate rank difference
    fdr.rank[[i]] <- rankdiff.fn(fdr.rank[[i]]) %>% add.shr.fn(i)
    lfc.rank[[i]] <- rankdiff.fn(lfc.rank[[i]]) %>% add.shr.fn(i)
    up.lfc.rank[[i]] <- rankdiff.fn(up.lfc.rank[[i]]) %>% add.shr.fn(i)
    down.lfc.rank[[i]] <- rankdiff.fn(down.lfc.rank[[i]]) %>% add.shr.fn(i) 

}

# Create combined data frames across the shrinkages 
fdr.rank.df <- rbinds.fn(fdr.rank) 
lfc.rank.df <- rbinds.fn(lfc.rank)
up.lfc.rank.df <- rbinds.fn(up.lfc.rank)
down.lfc.rank.df <- rbinds.fn(down.lfc.rank)



# Explore the ranking outputs
head(fdr.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000118523     TPM      1  3782.0311  Counts      1  3838.9322
## 2: ENSG00000181104     TPM      2  2461.7802  Counts      2  2499.2766
## 3: ENSG00000163814     TPM      3  1501.3368  Counts      3  1523.3582
## 4: ENSG00000211455     TPM      4 11847.3131  Counts      4 12017.3314
## 5: ENSG00000163762     TPM      5  1897.0253  Counts      5  1920.3664
## 6: ENSG00000138271     TPM      6   526.3565  Counts      6   533.8481
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          8.648484        0        1      None
## 2:          8.219169        0        2      None
## 3:          7.724454        0        3      None
## 4:          9.790094        0        4      None
## 5:          7.957600        0        5      None
## 6:          6.676177        0        6      None
head(lfc.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.032297  Counts      1  15.382334
## 2: ENSG00000248167     TPM      2  14.754788  Counts      3  14.908009
## 3: ENSG00000198796     TPM      3  46.260638  Counts      2  44.976447
## 4: ENSG00000178776     TPM      4   7.011544  Counts      4   7.107135
## 5: ENSG00000263647     TPM      5   6.032185  Counts      5   6.125926
## 6: ENSG00000205277     TPM      6  31.318717  Counts      6  19.783206
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          3.123398        0      1.0      None
## 2:          3.100488       -1      2.5      None
## 3:          4.230460        1      2.5      None
## 4:          2.357557        0      4.0      None
## 5:          2.207741        0      5.0      None
## 6:          3.718689        0      6.0      None
head(up.lfc.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000263647     TPM      1   6.032185  Counts      1   6.125926
## 2: ENSG00000271723     TPM      2  19.767862  Counts      2  20.267804
## 3: ENSG00000286053     TPM      3   5.058302    <NA>     NA         NA
## 4: ENSG00000261408     TPM      4   5.481395    <NA>     NA         NA
## 5: ENSG00000251177     TPM      5   8.960407  Counts      3   9.087309
## 6: ENSG00000205111     TPM      6   8.949362  Counts      5   8.562319
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          2.207741        0      1.0      None
## 2:          3.397917        0      2.0      None
## 3:                NA       NA       NA      None
## 4:                NA       NA       NA      None
## 5:          2.602991        2      4.0      None
## 6:          2.582526        1      5.5      None
head(down.lfc.rank.df)
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.032297  Counts      1  15.382334
## 2: ENSG00000248167     TPM      2  14.754788  Counts      3  14.908009
## 3: ENSG00000198796     TPM      3  46.260638  Counts      2  44.976447
## 4: ENSG00000178776     TPM      4   7.011544  Counts      4   7.107135
## 5: ENSG00000205277     TPM      5  31.318717  Counts      5  19.783206
## 6: ENSG00000137673     TPM      6 134.059247  Counts      6 136.045521
##    logMeanExpression RankDiff MeanRank Shrinkage
## 1:          3.123398        0      1.0      None
## 2:          3.100488       -1      2.5      None
## 3:          4.230460        1      2.5      None
## 4:          2.357557        0      4.0      None
## 5:          3.718689        0      5.0      None
## 6:          5.308674        0      6.0      None

Visualizing DEG ranks I: TPM- vs Counts-input

- x-axis: rank with TPM input

- y-axis: rank with Counts input

- Black diagonal lines: rank with TPM = rank with Counts

- Dot color: gene expression level (log-baseMean)

# Set a function plotting gene ranks between TPM- (x-axis) and Counts-Inputs (y-axis)
ranking.plot.fn <- function(df, rankedby) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, 
           aes(x=Rank.x, y=Rank.y, color=logMeanExpression)) + geom_point(alpha=0.5) + facet_grid(~Shrinkage) + theme_bw() + theme(strip.text.x=element_text(size=10)) + xlab("Rank with TPM") + ylab("Rank with Counts") + geom_abline(slope=1, color="black", size=0.5) + ggtitle(paste(rankedby, "Ranking with TPM vs Count Inputs")) + scale_color_gradient(low="blue", high="red") 
}

# Print output plots
ranking.plot.fn(fdr.rank.df, "FDR")

ranking.plot.fn(lfc.rank.df, "Log2FoldChange")

ranking.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)")

ranking.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)")

Visualizing DEG ranks II: Rank difference

- x-axis: expression level (log-baseMean)

- y-axis: rank difference (rank with TPM - rank with Counts)

- Black horizontal lines: rank with TPM = rank with Counts

- Dot color: average rank

# Set a function plotting the rank difference over the gene expression level
rankdiff.plot.fn <- function(df, rankedby, ylim) {

    df$Shrinkage <- factor(df$Shrinkage, levels=shrinkage)

    ggplot(df, aes(x=logMeanExpression, y=RankDiff, color=MeanRank)) + 
        geom_point(alpha=0.5) + 
        theme_bw() + 
        facet_grid(~Shrinkage) + 
        theme(strip.text.x=element_text(size=10)) + 
        ylab("Rank Difference (TPM - Count)") + 
        ggtitle(paste("Rank Difference (TPM - Count):", rankedby)) + 
        geom_hline(yintercept=0, color="black", size=0.5) + scale_color_gradient(low="blue", high="red") +
        ylim(-ylim, ylim)
}

# Print output plots
rankdiff.plot.fn(fdr.rank.df, "FDR", 100)

rankdiff.plot.fn(lfc.rank.df, "Log2FoldChange", 120)

rankdiff.plot.fn(up.lfc.rank.df, "Log2FoldChange (Increase)", 100)

rankdiff.plot.fn(down.lfc.rank.df, "Log2FoldChange (Decrease)", 75)

Distribution of rank difference in unshrunken data

- y-axis: abs(TPM-Count inputs)

- x-axis: FDR or log2FoldChange (Increase/Decrease)

# Set a function subsetting unshrunken data
unshr.rankdiff.fn <- function(df) {subset(df, Shrinkage == "None")}

# Create a list storing rankdiff data frames 
rankList <- list(unshr.rankdiff.fn(fdr.rank.df),
                 unshr.rankdiff.fn(lfc.rank.df),
                 unshr.rankdiff.fn(up.lfc.rank.df),
                 unshr.rankdiff.fn(down.lfc.rank.df))

# Assine result column as a factor with levels 
rankdiff.levels <- c("FDR", 
                     "log2FoldChange", 
                     "log2FoldChange.Increase", 
                     "log2FoldChange.Decrease")


# Name the list 
names(rankList) <- rankdiff.levels

# Create a new data frame storing rank difference by result type
rankdiff.dist <- data.frame(FDR=abs(rankList[[1]]$RankDiff), 
                            log2FoldChange=abs(rankList[[2]]$RankDiff),
                            log2FoldChange.Increase=abs(rankList[[3]]$RankDiff),
                            log2FoldChange.Decrease=abs(rankList[[4]]$RankDiff)) %>% gather(Result, RankDiff) 

rankdiff.dist$Result <- factor(rankdiff.dist$Result, levels=rankdiff.levels)

# Plot distribution of absolute rank difference
ggplot(rankdiff.dist,
       aes(x=Result, y=RankDiff, color=Result)) +
geom_jitter(alpha=0.5) + 
geom_boxplot(alpha=0.5, fill="grey", color="black") + 
theme_bw() + 
theme(axis.text.x=element_text(angle=45, hjust=1)) +
ggtitle("Distribution of Absolute Rank Difference without Shrinkage \n(TPM - Count Input)") + 
ylab("Absolute Rank Difference (TPM - Count Input)") 

Relationship between rank difference and number of transcript versions

- y-axis: abs(TPM-Count inputs)

- x-axis: number of transcripts (number of transcript id / gene id)

- dot color: mean rank

- 16 genes were missing in the plots

# Create a data frame storing the number of transcripts by gene id
AnnoDb.ntrans <- AnnoDb %>% 
    group_by(ENSEMBL) %>% 
    summarize(num.trans=n())

# Set a function adding the number of transcripts by gene id 
add.ntrans.fn <- function(df) {

    left_join(df, AnnoDb.ntrans, by=c("Gene"="ENSEMBL"))}




# Add a column indicating the number of transcripts by gene id to every rankdiff data frame
for (x in rankdiff.levels) {

    rankList[[x]] <- add.ntrans.fn(rankList[[x]])
}


# Explore the edited data frames
summary(rankList)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
head(rankList[[1]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000118523     TPM      1  3782.0311  Counts      1  3838.9322
## 2: ENSG00000181104     TPM      2  2461.7802  Counts      2  2499.2766
## 3: ENSG00000163814     TPM      3  1501.3368  Counts      3  1523.3582
## 4: ENSG00000211455     TPM      4 11847.3131  Counts      4 12017.3314
## 5: ENSG00000163762     TPM      5  1897.0253  Counts      5  1920.3664
## 6: ENSG00000138271     TPM      6   526.3565  Counts      6   533.8481
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          8.648484        0        1      None         1
## 2:          8.219169        0        2      None         2
## 3:          7.724454        0        3      None         3
## 4:          9.790094        0        4      None        13
## 5:          7.957600        0        5      None         5
## 6:          6.676177        0        6      None         2
head(rankList[[2]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.032297  Counts      1  15.382334
## 2: ENSG00000248167     TPM      2  14.754788  Counts      3  14.908009
## 3: ENSG00000198796     TPM      3  46.260638  Counts      2  44.976447
## 4: ENSG00000178776     TPM      4   7.011544  Counts      4   7.107135
## 5: ENSG00000263647     TPM      5   6.032185  Counts      5   6.125926
## 6: ENSG00000205277     TPM      6  31.318717  Counts      6  19.783206
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          3.123398        0      1.0      None         1
## 2:          3.100488       -1      2.5      None        70
## 3:          4.230460        1      2.5      None         6
## 4:          2.357557        0      4.0      None         3
## 5:          2.207741        0      5.0      None         1
## 6:          3.718689        0      6.0      None         6
head(rankList[[3]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000263647     TPM      1   6.032185  Counts      1   6.125926
## 2: ENSG00000271723     TPM      2  19.767862  Counts      2  20.267804
## 3: ENSG00000286053     TPM      3   5.058302    <NA>     NA         NA
## 4: ENSG00000261408     TPM      4   5.481395    <NA>     NA         NA
## 5: ENSG00000251177     TPM      5   8.960407  Counts      3   9.087309
## 6: ENSG00000205111     TPM      6   8.949362  Counts      5   8.562319
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          2.207741        0      1.0      None         1
## 2:          3.397917        0      2.0      None         4
## 3:                NA       NA       NA      None         4
## 4:                NA       NA       NA      None         3
## 5:          2.602991        2      4.0      None         1
## 6:          2.582526        1      5.5      None         4
head(rankList[[4]])
##               Gene Input.x Rank.x baseMean.x Input.y Rank.y baseMean.y
## 1: ENSG00000244398     TPM      1  15.032297  Counts      1  15.382334
## 2: ENSG00000248167     TPM      2  14.754788  Counts      3  14.908009
## 3: ENSG00000198796     TPM      3  46.260638  Counts      2  44.976447
## 4: ENSG00000178776     TPM      4   7.011544  Counts      4   7.107135
## 5: ENSG00000205277     TPM      5  31.318717  Counts      5  19.783206
## 6: ENSG00000137673     TPM      6 134.059247  Counts      6 136.045521
##    logMeanExpression RankDiff MeanRank Shrinkage num.trans
## 1:          3.123398        0      1.0      None         1
## 2:          3.100488       -1      2.5      None        70
## 3:          4.230460        1      2.5      None         6
## 4:          2.357557        0      4.0      None         3
## 5:          3.718689        0      5.0      None         6
## 6:          5.308674        0      6.0      None         3
# Set a function plotting rank difference vs number of transcripts 
rank.ntrans.plot.fn <- function(df, title) {

    ggplot(df, aes(x=num.trans, y=abs(RankDiff), color=MeanRank)) + 
        geom_jitter(alpha=0.5) + 
        theme_bw() + 
        ggtitle(paste("Rank Difference vs Number of Alternative Transcripts \nin", title)) + 
        xlab("Number of Alternative Transcripts") + 
        ylab("Absolute Rank Difference \n(TPM - Counts)") + scale_color_gradient(low="blue", high="red") 
}

# Print the plots
rank.ntrans.plot.fn(rankList[[1]], "FDR")

rank.ntrans.plot.fn(rankList[[2]], "log2FoldChange")

rank.ntrans.plot.fn(rankList[[3]], "log2FoldChange (Increase)")

rank.ntrans.plot.fn(rankList[[4]], "log2FoldChange (Decrease)")

Finding genes having large difference in rankings

# Initialize a list storing rankdiff genes 
large.rankdiff <- rankList

# Assign a vector storing minimum (thresholds) rankdiff for filtering large rankdiff genes
rankdiff.thr <- c(10, 10, 10, 10)

names(rankdiff.thr) <- rankdiff.levels

for (x in rankdiff.levels) {

    # Filter out observations below the rankdiff thresholds
    large.rankdiff[[x]] <- subset(rankList[[x]], 
                                      abs(RankDiff) > rankdiff.thr[x]) 

}

# Explore the filtered genes 
summary(large.rankdiff)
##                         Length Class      Mode
## FDR                     12     data.table list
## log2FoldChange          12     data.table list
## log2FoldChange.Increase 12     data.table list
## log2FoldChange.Decrease 12     data.table list
dim(large.rankdiff[[rankdiff.levels[1]]])
## [1] 65 12
dim(large.rankdiff[[rankdiff.levels[2]]])
## [1]  5 12
dim(large.rankdiff[[rankdiff.levels[3]]])
## [1]  7 12
dim(large.rankdiff[[rankdiff.levels[4]]])
## [1]  4 12

Summarizing up/down DEGs with an upset plot

- red bar: input type

- blue bar: directionality of gene expression change

# Set a function cleaning data to generate upset plots 
upset.input.fn <- function(df) {

    df <- df %>% 

        # Filter genes with valid padj 
        filter(!is.na(padj)) %>% 
        
        mutate(Up=ifelse(FDR == FDRv[1] & log2FoldChange > 0, Gene, ""), # What are upregulated genes? 
               
               Down=ifelse(FDR == FDRv[1] & log2FoldChange < 0, Gene, ""),  # What are downregulated genes? 
               
               Unchanged=ifelse(FDR == FDRv[2], Gene, ""),   # What are unchanged genes? 
               
               TPM_Input=ifelse(Input == "TPM", Gene, ""),   # What are the genes from TPM input? 
               
               Counts_Input=ifelse(Input == "Counts", Gene, ""))   # What are the genes from Counts input? 

    # Create a list storing groups of interest
    upsetInput <- list(Up=df$Up, 
                       Down=df$Down, 
                       Unchanged=df$Unchanged, 
                       TPM_Input=df$TPM, 
                       Counts_Input=df$Counts) 

    return(upsetInput)

}

upsetList <- upset.input.fn(both.df)


# Create the upset plot 
upset(fromList(upsetList), 
      sets=names(upsetList),   # What group to display 
      sets.x.label="Number of Genes per Group",
      order.by="freq",
      point.size=3,
      sets.bar.color=c("red", "red","blue", "blue", "blue"),
      text.scale = 1.5, number.angles=30) 

Session Info

sessionInfo()
## R version 4.0.3 (2020-10-10)
## Platform: x86_64-conda-linux-gnu (64-bit)
## Running under: Ubuntu 20.04.1 LTS
## 
## Matrix products: default
## BLAS/LAPACK: /home/mira/miniconda3/envs/r/lib/libopenblasp-r0.3.10.so
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats4    parallel  stats     graphics  grDevices utils     datasets 
## [8] methods   base     
## 
## other attached packages:
##  [1] AnnotationDbi_1.52.0        ashr_2.2-47                
##  [3] apeglm_1.12.0               UpSetR_1.4.0               
##  [5] gridExtra_2.3               pheatmap_1.0.12            
##  [7] DESeq2_1.30.0               SummarizedExperiment_1.20.0
##  [9] Biobase_2.50.0              MatrixGenerics_1.2.0       
## [11] matrixStats_0.57.0          GenomicRanges_1.42.0       
## [13] GenomeInfoDb_1.26.2         IRanges_2.24.0             
## [15] S4Vectors_0.28.1            tximport_1.18.0            
## [17] forcats_0.5.0               stringr_1.4.0              
## [19] dplyr_1.0.2                 purrr_0.3.4                
## [21] readr_1.4.0                 tidyr_1.1.2                
## [23] tibble_3.0.4                ggplot2_3.3.2              
## [25] tidyverse_1.3.0             AnnotationHub_2.22.0       
## [27] BiocFileCache_1.14.0        dbplyr_2.0.0               
## [29] BiocGenerics_0.36.0         rmarkdown_2.5              
## [31] data.table_1.13.4          
## 
## loaded via a namespace (and not attached):
##  [1] colorspace_2.0-0              ellipsis_0.3.1               
##  [3] XVector_0.30.0                fs_1.5.0                     
##  [5] rstudioapi_0.13               farver_2.0.3                 
##  [7] bit64_4.0.5                   mvtnorm_1.1-1                
##  [9] interactiveDisplayBase_1.28.0 fansi_0.4.1                  
## [11] lubridate_1.7.9.2             xml2_1.3.2                   
## [13] splines_4.0.3                 geneplotter_1.68.0           
## [15] knitr_1.30                    jsonlite_1.7.2               
## [17] broom_0.7.2                   annotate_1.68.0              
## [19] shiny_1.5.0                   BiocManager_1.30.10          
## [21] compiler_4.0.3                httr_1.4.2                   
## [23] backports_1.2.1               assertthat_0.2.1             
## [25] Matrix_1.2-18                 fastmap_1.0.1                
## [27] cli_2.2.0                     later_1.1.0.1                
## [29] htmltools_0.5.0               tools_4.0.3                  
## [31] coda_0.19-4                   gtable_0.3.0                 
## [33] glue_1.4.2                    GenomeInfoDbData_1.2.4       
## [35] rappdirs_0.3.1                Rcpp_1.0.5                   
## [37] bbmle_1.0.23.1                cellranger_1.1.0             
## [39] vctrs_0.3.5                   xfun_0.19                    
## [41] ps_1.5.0                      rvest_0.3.6                  
## [43] irlba_2.3.3                   mime_0.9                     
## [45] lifecycle_0.2.0               XML_3.99-0.5                 
## [47] MASS_7.3-53                   zlibbioc_1.36.0              
## [49] scales_1.1.1                  hms_0.5.3                    
## [51] promises_1.1.1                RColorBrewer_1.1-2           
## [53] yaml_2.2.1                    curl_4.3                     
## [55] memoise_1.1.0                 emdbook_1.3.12               
## [57] bdsmatrix_1.3-4               SQUAREM_2020.5               
## [59] stringi_1.5.3                 RSQLite_2.2.1                
## [61] BiocVersion_3.12.0            genefilter_1.72.0            
## [63] BiocParallel_1.24.1           truncnorm_1.0-8              
## [65] rlang_0.4.9                   pkgconfig_2.0.3              
## [67] bitops_1.0-6                  invgamma_1.1                 
## [69] evaluate_0.14                 lattice_0.20-41              
## [71] labeling_0.4.2                bit_4.0.4                    
## [73] tidyselect_1.1.0              plyr_1.8.6                   
## [75] magrittr_2.0.1                R6_2.5.0                     
## [77] generics_0.1.0                DelayedArray_0.16.0          
## [79] DBI_1.1.0                     pillar_1.4.7                 
## [81] haven_2.3.1                   withr_2.3.0                  
## [83] mixsqp_0.3-43                 survival_3.2-7               
## [85] RCurl_1.98-1.2                modelr_0.1.8                 
## [87] crayon_1.3.4                  locfit_1.5-9.4               
## [89] grid_4.0.3                    readxl_1.3.1                 
## [91] blob_1.2.1                    reprex_0.3.0                 
## [93] digest_0.6.27                 xtable_1.8-4                 
## [95] numDeriv_2016.8-1.1           httpuv_1.5.4                 
## [97] munsell_0.5.0